* Session setup: close any log left open by a previous run (ignoring the error
* if none is open), empty the data in memory, turn off output paging, and make
* plain text the default log format.
capture log close
clear
set more off
set logtype text


/**
NOTE: This file must be run on the NBER server
**/

***** CHANGE TO OWN HOME DIRECTORY on NBER SERVER
* Every file read or written below is addressed relative to this directory
* (referenced as $homedir).
global homedir "/homes/data/cens1940.work/olivetti/lsalisbu"



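/* What this file does:

(1) Runs individual-level regressions of marital status on (a) years of education and (b) education quartile dummies
(2) Records the R-squared and share50 (the share of the "child" sample with a top-50 first name)
(3) Runs pseudo regressions of marital status on (a) pseudo years of education (mean education by first name) and (b) pseudo quartile dummies
(4) Saves the dataset (dataset_for_R_1.dta) for the simulation with name assignment, which is carried out in the R file

*/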

* Load the simulation dataset. The path is quoted, like the other paths in this
* file, so that a home directory containing spaces does not break the command.
use "$homedir/simulation_dataset_1.dta", clear

* First name = first word of the name field, with periods removed, surrounding
* blanks trimmed, and proper-case capitalization applied.
gen str first=word(name,1)
replace first=subinstr(first, ".", "", .)
replace first=trim(first)
replace first=proper(first)

* Flag names that are empty or contain any of the characters ? % - ! *
gen noname=(strpos(first, "?")>0 | strpos(first, "%")>0 | strpos(first, "-")>0 | strpos(first, "!")>0 | strpos(first, "*")>0)
replace noname=1 if missing(first)

* Report how many observations are flagged, then drop them.
tab noname
drop if noname==1

* Numeric identifier for each distinct cleaned first name.
egen namegrp = group(first)

* Pseudo education: mean of educ_attain over the samp1==1 observations that
* share a first name, assigned to every observation with that name.
gen temp = educ_attain if samp1==1
egen educ_pseudo = mean(temp), by(namegrp)
drop temp

*** Adult sample (samp2==1) -- marital status regressed on own educational
*** attainment, women ages 30-45

* Linear specification in years of education (shown on screen only, not logged).
regress ever_married educ_attain if samp2 == 1

* Quartile-dummy specification, logged to tableA-2 with standard errors
* clustered by first name.
log using "$homedir/tableA-2.txt", replace
regress ever_married i.educ_qtile if samp2 == 1, vce(cluster namegrp)
log close

* Stash the estimation sample of the quartile regression (samp = 1) so that it
* can be stacked with the pseudo-quartile sample later in the file.
preserve
    keep if e(sample)
    keep ever_married educ_qtile namegrp
    generate samp = 1
    save "$homedir/temp_file_1.dta", replace
restore

*** Properties of the first-name distribution in the "child" sample (samp1==1)
* (Mean education by first name in the "child" sample, educ_pseudo, has
* already been computed earlier in this file.)

* R2 from the regression of actual education on mean education by name in the
* "child" sample.
reg educ_attain educ_pseudo if samp1==1
local rsq = e(r2)

* Share of people with a top-50 name in the "child" sample.
* count_name_samp = number of "child" observations carrying the name;
* tag_name        = 1 for exactly one "child" observation per name, 0 otherwise.
egen count_name_samp = count(namegrp) if samp1==1, by(namegrp)
egen tag_name = tag(namegrp) if samp1==1

* Order names from most to least frequent. namegrp is included as a tiebreaker
* so that all observations of a name stay contiguous (tagged observation first)
* even when several names share the same count. Without it, the untagged
* observations of tied names are interleaved and pick up the rank of the last
* tied name, which miscounts top50 when a tie straddles rank 50 and makes the
* ranking depend on an arbitrary sort order. Observations outside the "child"
* sample have a missing count and sort last.
gsort -count_name_samp namegrp -tag_name

* Rank of each name = running count of tagged observations; missing outside
* the "child" sample.
gen long name_rank = sum(tag_name) if samp1==1

* Missing ranks compare as larger than 50, so top50 is 0 outside the "child" sample.
gen top50 = name_rank<=50
count if top50==1
local n = r(N)
count if samp1==1
local d = r(N)

local share50 = `n'/`d'

* Number of distinct first names in the "child" sample.
qui: count if tag_name==1
qui: local n_names = r(N)

di "`n_names' in child sample"



***** Pseudo regressions ****

* Quartiles of pseudo education (mean education by first name) within the
* adult sample.
xtile educ_qtile_pseudo = educ_pseudo if samp2 == 1, nq(4)

* Linear specification in pseudo years of education (screen only, not logged).
regress ever_married educ_pseudo if samp2 == 1

* Dummies for pseudo quartiles 2-4; left missing wherever the pseudo quartile
* itself is missing.
forvalues q = 2/4 {
    generate pq`q' = educ_qtile_pseudo == `q' if educ_qtile_pseudo != .
}

* Pseudo-quartile specification, appended to the tableA-2 log.
log using "$homedir/tableA-2.txt", append
regress ever_married pq2 pq3 pq4 if samp2 == 1, vce(cluster namegrp)
log close

* Keep the coefficient on the top pseudo quartile for the tableA-4 summary.
local coef_q4 = _b[pq4]

* Stack this estimation sample (samp = 2) under the actual-quartile sample
* already stored in the temporary file, renaming the pseudo quartile so both
* samples share the variable educ_qtile.
preserve
    keep if e(sample)
    keep ever_married educ_qtile_pseudo namegrp
    rename educ_qtile_pseudo educ_qtile
    generate samp = 2
    append using "$homedir/temp_file_1.dta"
    save "$homedir/temp_file_1.dta", replace
restore

*** Check equality of actual and pseudo quartiles
* Side-by-side frequency tables for the adult sample.
tabulate educ_qtile if samp2 == 1
tabulate educ_qtile_pseudo if samp2 == 1

* Count the distinct first names that appear in the adult sample with a
* non-missing pseudo education ("linked" names).
generate temp = first if samp2 == 1 & educ_pseudo != .
egen tag_temp = tag(temp)
count if tag_temp == 1
local n_names_linked = r(N)

display "`n_names_linked' names are linked"

* linked = 1 for every observation whose name group contains a tagged
* (linked) observation, 0 otherwise.
egen linked = max(tag_temp), by(namegrp)

* Variance of pseudo education in the adult sample, kept for the tableA-4 summary.
summarize educ_pseudo if samp2 == 1
local var_pseudo = (r(sd))^2

* Same summary restricted to linked names (screen only).
summarize educ_pseudo if samp2 == 1 & linked == 1


* Export the variables needed by the R simulation.
keep samp1 samp2 namegrp educ_attain educ_qtile ever_married

* Report and drop observations with a missing value in any of the key
* variables. missing() is used instead of "== ." so that extended missing
* values (.a-.z) are dropped as well rather than being exported.
foreach v of varlist educ_attain namegrp ever_married {
    count if missing(`v')
    drop if missing(`v')
}

* Saved in Stata 12 format; the path is quoted, like the other paths in this
* file, so that a home directory containing spaces does not break the command.
saveold "$homedir/dataset_for_R_1.dta", replace version(12)


* Write the summary statistics collected in local macros earlier in the file
* to their own log, tableA-4.
log using "$homedir/tableA-4.txt", replace

display "Coeff on pseudo 4th quartile is `coef_q4'"
display "Variance of pseudo education is `var_pseudo'"
display "R2 = `rsq'"
display "Share with top 50 name is `share50'"

log close

* Load the stacked file: samp==1 rows carry actual education quartiles and
* samp==2 rows carry pseudo quartiles, both stored in educ_qtile.
use "$homedir/temp_file_1.dta", clear

* Quartile dummies.
forvalues i = 1/4 {
    generate q`i' = educ_qtile == `i'
}

* Indicator for the pseudo sample and its interactions with the quartile dummies.
generate pseudo = samp == 2

forvalues i = 1/4 {
    generate pseudo_q`i' = q`i' * pseudo
}

* Pooled regression, appended to the tableA-2 log. The log stays open for the
* ratio calculations that follow.
log using "$homedir/tableA-2.txt", append

regress ever_married q2 q3 q4 pseudo pseudo_q2 pseudo_q3 pseudo_q4, vce(cluster namegrp)


**** Ratio of true to pseudo coefficient for quartiles 2, 3 and 4
*
* In the stacked regression just estimated, the coefficient on q2-q4 is the
* quartile effect in the true sample, and that coefficient plus the one on the
* matching pseudo_q2-pseudo_q4 interaction is the quartile effect in the
* pseudo sample. For each quartile:
*
*     ratio = b_true/(b_true + b_int)
*
* The standard error uses the delta method with gradient
*
*     d ratio/d b_true =  b_int /(b_true + b_int)^2
*     d ratio/d b_int  = -b_true/(b_true + b_int)^2
*
* and the confidence interval is ratio +/- 1.96*se.
*
* Coefficient positions are looked up by name, so the calculation does not
* depend on the order of the regressors in the regression command.
matrix b = e(b)
matrix vcv=e(V)

forvalues q = 2/4 {

	* Columns of e(b) / e(V) holding the true coefficient and the interaction.
	local i = colnumb(b, "q`q'")
	local j = colnumb(b, "pseudo_q`q'")

	local rat`q' = b[1,`i']/(b[1,`i'] + b[1,`j'])

	* Delta-method gradient with respect to b_true (multa) and b_int (multb).
	local multa = (b[1,`j'])/(b[1,`i'] + b[1,`j'])^2
	local multb = (-b[1,`i'])/(b[1,`i'] + b[1,`j'])^2

	local var`q' = ((`multa')^2)*vcv[`i',`i'] + ((`multb')^2)*vcv[`j',`j'] + 2*`multa'*`multb'*vcv[`j',`i']
	local se`q' = sqrt(`var`q'')

	* Each number is preceded by a label line so the log can be read without
	* referring back to this do-file.
	di "RATIO OF TRUE TO PSEUDO COEFFICIENT ON QUARTILE `q'"
	di `rat`q''

	di "STANDARD ERROR OF RATIO QUARTILE `q'"
	di `se`q''

	di "CONFIDENCE INTERVAL FOR RATIO OF TRUE TO PSEUDO COEFFICIENT ON QUARTILE `q'"
	di `rat`q'' - 1.96*`se`q''
	di `rat`q'' + 1.96*`se`q''
}

log close

* Remove the intermediate stacked file now that it is no longer needed.
erase "$homedir/temp_file_1.dta"
